
***************************************************************************************************
******************************Minor do file: school stability and VA*********************************
***************************************************************************************************

*This .do file create two school-level varaibles: School stability rate and school VA

*Inputs: the main analysis dataset
*1. "VA_demographics.dta"
*2. "VA_test_scores.dta"

*Outputs (in order):
*1. "school_stability_rate.dta"
*2. "school_VA.dta"
***************************************************************************************************
***************************************************************************************************
***************************************************************************************************

*First calculate school stability rate
clear all
set more off
use "/data_analysis/Eliso_Complete/Data/VA_demographics.dta"
*School stability rate which is the percentage of students from the October membership count who are still present in the second semester (February count). Will do for prior 3 years: 2013, 2014, 2015
keep if year>=2003 & year<=2016
egen denom=count(locationcode), by(locationcode year)
gen numer1=1 if locationcode!=prlprefcode_spring
egen numer=count(numer1), by(locationcode year)

gen stability_rate=(denom-numer)/denom
su stability_rate
collapse stability_rate, by(locationcode year)
compress
save "/data_analysis/Eliso_Complete/Data/school_stability_rate.dta", replace



*now, calculate the school VA
clear all
set more off
use "/data_analysis/Eliso_Complete/Data/VA_demographics.dta"
*Value-added. Will do for prior 3 years with valid test data: 2011, 2012, 2013 (recall missing 2014 test scores)
keep if year>=2011 & year<=2013
*Merge in test scores
merge 1:1 stdpseudoid year using "/data_analysis/Eliso_Complete/Data/VA_test_scores.dta"
keep if _merge==3
drop _merge
*Use testing grade; just use grade via demos if missing (Note testing_grade and grade are identical in RD sample)
ren grade grade2
gen grade=testing_grade
replace grade=grade2 if grade==.
drop grade2

*Can't do grade 2 as no pre score
drop if grade==2
*Also, standardized math test only available to grade 7, so drop if grade>5
drop if grade>7

*Drop if uncertain school assignment (i.e., switches school during year)
drop if locationcode!=prlprefcode_spring

*Covariates first: 
*For pared, assign a missing categoy:
foreach var of varlist pared{
replace `var'=9 if `var'==.
}
*Fix one missing age covar
su age if year==2013 & grade==3
replace age=r(mean) if age==.
*Make ellcode numeric
encode ellevelcode, gen(el_code)
drop ellevelcode
ren el_code ellevelcode

local demo_control "i.ellevelcode i.gender i.frl i.ethnicity i.pared i.repeat_grade i.grade#c.age"

egen school_grade_year=group(locationcode grade year)

*Also include school-grade means of the covariates;
foreach var of varlist ellevelcode gender frl ethnicity pared repeat_grade  {
tab `var', gen(`var'_dummy)
}
drop ellevelcode_dummy6 gender_dummy2 frl_dummy3 ethnicity_dummy7 pared_dummy6 repeat_grade_dummy2

foreach var of varlist ellevelcode_dummy1 ellevelcode_dummy2 ellevelcode_dummy3 ellevelcode_dummy4 ellevelcode_dummy5 gender_dummy1 frl_dummy1 frl_dummy2 ethnicity_dummy1 ethnicity_dummy2 ethnicity_dummy3 ethnicity_dummy4 ethnicity_dummy5 ethnicity_dummy6 pared_dummy1 pared_dummy2 pared_dummy3 pared_dummy4 pared_dummy5 repeat_grade_dummy1 {
di "Var is `var'"
egen sg_`var'=mean(`var'), by(school_grade_year)
}
drop ellevelcode_dummy1 ellevelcode_dummy2 ellevelcode_dummy3 ellevelcode_dummy4 ellevelcode_dummy5 gender_dummy1 frl_dummy1 frl_dummy2 ethnicity_dummy1 ethnicity_dummy2 ethnicity_dummy3 ethnicity_dummy4 ethnicity_dummy5 ethnicity_dummy6 pared_dummy1 pared_dummy2 pared_dummy3 pared_dummy4 pared_dummy5 repeat_grade_dummy1

local demo_control_sg "sg_ellevelcode_dummy1 sg_ellevelcode_dummy2 sg_ellevelcode_dummy3 sg_ellevelcode_dummy4 sg_ellevelcode_dummy5 sg_gender_dummy1 sg_frl_dummy1 sg_frl_dummy2 sg_ethnicity_dummy1 sg_ethnicity_dummy2 sg_ethnicity_dummy3 sg_ethnicity_dummy4 sg_ethnicity_dummy5 sg_ethnicity_dummy6 sg_pared_dummy1 sg_pared_dummy2 sg_pared_dummy3 sg_pared_dummy4 sg_pared_dummy5 sg_repeat_grade_dummy1"

compress
*(i) cubics in school-grade means of prior-year test scores in math and English each interacted with grade
foreach var of varlist lag_math lag_ela {
egen sg_`var'=mean(`var'), by(school_grade_year)
}
local test_control_class_sg "i.grade#c.sg_lag_math i.grade#c.sg_lag_math#c.sg_lag_math i.grade#c.sg_lag_math#c.sg_lag_math#c.sg_lag_math i.grade#c.sg_lag_ela i.grade#c.sg_lag_ela#c.sg_lag_ela i.grade#c.sg_lag_ela#c.sg_lag_ela#c.sg_lag_ela"

*Calc school-grade size
egen sg_size=count(stdpseudoid), by(school_grade_year)

*Need valid math and lagged math score to use:
drop if math_scorez==. | lag_math==.

*Deal with English missing. From Chetty: "When prior test scores in the other subject are missing, we set the other subject prior score to zero and include an indicator for
*missing data in the other subject interacted with the controls for prior own-subject test scores."
gen missing_ela=(lag_ela==.)
replace lag_ela=0 if lag_ela==.

*Now run value-added
local test_control "i.grade#c.lag_math i.grade#c.lag_math#c.lag_math i.grade#c.lag_math#c.lag_math#c.lag_math i.grade#c.lag_ela i.grade#c.lag_ela#c.lag_ela i.grade#c.lag_ela#c.lag_ela#c.lag_ela"
local missing_read_control "i.missing_ela#i.grade#c.lag_math i.missing_ela#i.grade#c.lag_math#c.lag_math i.missing_ela#i.grade#c.lag_math#c.lag_math#c.lag_math"
local test_control_sg "i.grade#c.sg_lag_math i.grade#c.sg_lag_math#c.sg_lag_math i.grade#c.sg_lag_math#c.sg_lag_math#c.sg_lag_math i.grade#c.sg_lag_ela i.grade#c.sg_lag_ela#c.sg_lag_ela i.grade#c.sg_lag_ela#c.sg_lag_ela#c.sg_lag_ela"

local demo_control "i.ellevelcode i.gender i.frl i.ethnicity i.pared i.repeat_grade i.grade#c.age"
local demo_control_sg "sg_ellevelcode_dummy1 sg_ellevelcode_dummy2 sg_ellevelcode_dummy3 sg_ellevelcode_dummy4 sg_ellevelcode_dummy5 sg_gender_dummy1 sg_frl_dummy1 sg_frl_dummy2 sg_ethnicity_dummy1 sg_ethnicity_dummy2 sg_ethnicity_dummy3 sg_ethnicity_dummy4 sg_ethnicity_dummy5 sg_ethnicity_dummy6 sg_pared_dummy1 sg_pared_dummy2 sg_pared_dummy3 sg_pared_dummy4 sg_pared_dummy5 sg_repeat_grade_dummy1"

*Calc effective school-grade size
egen school_year=group(locationcode year)
egen effective_school_size=count(stdpseudoid), by(school_year)
qui compress

*Run main VA regression
areg math_scorez sg_size `test_control' `missing_read_control' `test_control_sg' `demo_control' `demo_control_sg' i.grade i.year, absorb(locationcode) 

* Stata dresidual after areg results in y - xb
* This is Rijt = alpha_j + eps_ijt in our notes (note Stata areg constrains mean of FE to be 0)
predict StudDresid, dresidual
sum StudDresid
keep if e(sample)==1

* Use dresidual to estimate var(alpha) + var(epsilon)
qui sum StudDresid
gen VarTotal = r(sd)^2

* Mean school-year residuals (average student residual for each school-year)
* This is bar(Ajt) in our notes
egen StudTVA = mean(StudDresid), by(school_year)

* Compute var epsilon component 
gen VarEpsMLEComp = (StudDresid - StudTVA)^2

* This is sum(Rijt - Rjt)^2 in notes (school-year level)
egen VarEpsMLECompSum = sum(VarEpsMLEComp), by(school_year)

* Collapse data at school-year level
collapse (mean) StudTVA effective_school_size VarEpsMLECompSum VarTotal, by(locationcode year school_year)

* Compute MLE School FE estimate
* MLE Denom. (Total students taught by teacher over career)
egen effective_school_sizeSum = sum(effective_school_size), by(locationcode)
* MLE Numerator 
gen StudTVAClass = StudTVA*effective_school_size
egen StudTVAProd = sum(StudTVAClass), by(locationcode)
* MLE FE (class size weighted bar(Ajt))
gen StudTVAMLE = StudTVAProd/effective_school_sizeSum

keep locationcode StudTVA effective_school_size year StudTVAMLE VarEpsMLECompSum VarTotal
ren *, lower

*Yj is "studtvag4mle" in the data
*To manually calculate use following code:
gen yj_num1=(studtva*effective_school_size)
egen yj_num=sum(yj_num1), by(locationcode)
egen yj_denom=sum(effective_school_size), by(locationcode)
gen yj=yj_num/yj_denom

*MLE Parameters: To find \sigma_\alpha collapse data by teachid and then summ yj
preserve
collapse yj, by(locationcode)
su yj
local sigma=r(sd)
restore
*Note will make sigmas represent variances (rather than sd)
gen sigma_a=(`sigma')^2
*Then \sigma^2_\epsilon is:
gen sigma_e=vartotal-sigma_a

gen peb=yj*(sigma_a/(sigma_a+(sigma_e/yj_denom)))

ren studtva school_yearFE
ren effective_school_size total_VA_enroll
ren peb school_VA

keep locationcode year school_yearFE school_VA total_VA_enroll

order locationcode year school_VA school_yearFE total_VA_enroll
qui compress

*Collapse to school level
collapse school_VA, by(locationcode)
save "/data_analysis/Eliso_Complete/Data/school_VA.dta", replace
